require(readr)
## Loading required package: readr
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(stringr)
## Loading required package: stringr
require(zipcode)
## Loading required package: zipcode
## Warning in library(package, lib.loc = lib.loc, character.only = TRUE,
## logical.return = TRUE, : there is no package called 'zipcode'
require(tidyverse)
## Loading required package: tidyverse
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: purrr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag(): dplyr, stats
require(ggmap)
## Loading required package: ggmap
require(lubridate)
## Loading required package: lubridate
## Warning: package 'lubridate' was built under R version 3.4.2
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
require(viridis)
## Loading required package: viridis
## Loading required package: viridisLite
require(ggthemes)
## Loading required package: ggthemes
require(plotly)
## Loading required package: plotly
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggmap':
##
## wind
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
nypd.vehicle <- read_csv("NYPD_Motor_Vehicle_Collisions.csv", col_names = T)
## Parsed with column specification:
## cols(
## .default = col_character(),
## TIME = col_time(format = ""),
## `ZIP CODE` = col_integer(),
## LATITUDE = col_double(),
## LONGITUDE = col_double(),
## `NUMBER OF PERSONS INJURED` = col_integer(),
## `NUMBER OF PERSONS KILLED` = col_integer(),
## `NUMBER OF PEDESTRIANS INJURED` = col_integer(),
## `NUMBER OF PEDESTRIANS KILLED` = col_integer(),
## `NUMBER OF CYCLIST INJURED` = col_integer(),
## `NUMBER OF CYCLIST KILLED` = col_integer(),
## `NUMBER OF MOTORIST INJURED` = col_integer(),
## `NUMBER OF MOTORIST KILLED` = col_integer(),
## `UNIQUE KEY` = col_integer()
## )
## See spec(...) for full column specifications.
summary(nypd.vehicle)
## DATE TIME BOROUGH ZIP CODE
## Length:1152443 Length:1152443 Length:1152443 Min. :10000
## Class :character Class1:hms Class :character 1st Qu.:10128
## Mode :character Class2:difftime Mode :character Median :11205
## Mode :numeric Mean :10812
## 3rd Qu.:11236
## Max. :11697
## NA's :323606
## LATITUDE LONGITUDE LOCATION ON STREET NAME
## Min. : 0.00 Min. :-201.36 Length:1152443 Length:1152443
## 1st Qu.:40.67 1st Qu.: -73.98 Class :character Class :character
## Median :40.72 Median : -73.93 Mode :character Mode :character
## Mean :40.72 Mean : -73.92
## 3rd Qu.:40.77 3rd Qu.: -73.87
## Max. :41.13 Max. : 0.00
## NA's :211564 NA's :211564
## CROSS STREET NAME OFF STREET NAME NUMBER OF PERSONS INJURED
## Length:1152443 Length:1152443 Min. : 0.0000
## Class :character Class :character 1st Qu.: 0.0000
## Mode :character Mode :character Median : 0.0000
## Mean : 0.2567
## 3rd Qu.: 0.0000
## Max. :43.0000
##
## NUMBER OF PERSONS KILLED NUMBER OF PEDESTRIANS INJURED
## Min. :0.000000 Min. : 0.00000
## 1st Qu.:0.000000 1st Qu.: 0.00000
## Median :0.000000 Median : 0.00000
## Mean :0.001201 Mean : 0.05209
## 3rd Qu.:0.000000 3rd Qu.: 0.00000
## Max. :8.000000 Max. :28.00000
##
## NUMBER OF PEDESTRIANS KILLED NUMBER OF CYCLIST INJURED
## Min. :0.000000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.00000
## Median :0.000000 Median :0.00000
## Mean :0.000664 Mean :0.02078
## 3rd Qu.:0.000000 3rd Qu.:0.00000
## Max. :8.000000 Max. :4.00000
##
## NUMBER OF CYCLIST KILLED NUMBER OF MOTORIST INJURED
## Min. :0.00e+00 Min. : 0.0000
## 1st Qu.:0.00e+00 1st Qu.: 0.0000
## Median :0.00e+00 Median : 0.0000
## Mean :7.98e-05 Mean : 0.1852
## 3rd Qu.:0.00e+00 3rd Qu.: 0.0000
## Max. :2.00e+00 Max. :43.0000
##
## NUMBER OF MOTORIST KILLED CONTRIBUTING FACTOR VEHICLE 1
## Min. :0.000000 Length:1152443
## 1st Qu.:0.000000 Class :character
## Median :0.000000 Mode :character
## Mean :0.000458
## 3rd Qu.:0.000000
## Max. :5.000000
##
## CONTRIBUTING FACTOR VEHICLE 2 CONTRIBUTING FACTOR VEHICLE 3
## Length:1152443 Length:1152443
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## CONTRIBUTING FACTOR VEHICLE 4 CONTRIBUTING FACTOR VEHICLE 5
## Length:1152443 Length:1152443
## Class :character Class :character
## Mode :character Mode :character
##
##
##
##
## UNIQUE KEY VEHICLE TYPE CODE 1 VEHICLE TYPE CODE 2
## Min. : 22 Length:1152443 Length:1152443
## 1st Qu.: 290312 Class :character Class :character
## Median :3212342 Mode :character Mode :character
## Mean :2286022
## 3rd Qu.:3500480
## Max. :3789579
##
## VEHICLE TYPE CODE 3 VEHICLE TYPE CODE 4 VEHICLE TYPE CODE 5
## Length:1152443 Length:1152443 Length:1152443
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
##
First, we mutate two new variables Num_Injured and Num_Killed which are the sum of all injured or killed people (people in the vehicles, pedestrians, cyclists, motorists) and then group them by cross street. We then summarise by number of accidents, number of injuries and number of deaths for each of the cross streets.
colnames(nypd.vehicle) <- str_replace_all(colnames(nypd.vehicle), " ", "_")
q1 <- nypd.vehicle %>%
mutate(num = 1,
Num_Injured = NUMBER_OF_PERSONS_INJURED + NUMBER_OF_PEDESTRIANS_INJURED + NUMBER_OF_CYCLIST_INJURED + NUMBER_OF_MOTORIST_INJURED,
Num_Killed = NUMBER_OF_PERSONS_KILLED + NUMBER_OF_PEDESTRIANS_KILLED + NUMBER_OF_CYCLIST_KILLED + NUMBER_OF_MOTORIST_KILLED
) %>%
filter(!is.na(CROSS_STREET_NAME)) %>%
group_by(CROSS_STREET_NAME) %>%
summarise(Number_of_Accidents = n(),
Num_Injured = sum(Num_Injured),
Num_Killed = sum(Num_Killed))
We use three different criteria to determine the most dangerous/safest intersections based on Number_of_Accidents, Num_Injured, and Num_Killed. 3rd ave was the most dangerous in terms of accidents and Broadway in terms of number injured and killed. We created tables for the safest and most dangerous intersections, using max of accidents and min of accidents, respectively. Non-busy places such as precincts have the lowest number of non-zero accidents.
knitr::kable(q1[q1$Number_of_Accidents == max(q1$Number_of_Accidents), ],
caption = "Most dangerous NYC intersections")
| CROSS_STREET_NAME | Number_of_Accidents | Num_Injured | Num_Killed |
|---|---|---|---|
| 3 AVENUE | 12116 | 4547 | 10 |
knitr::kable(q1[q1$Num_Injured == max(q1$Num_Injured), ],
caption = "Most dangerous NYC intersections")
| CROSS_STREET_NAME | Number_of_Accidents | Num_Injured | Num_Killed |
|---|---|---|---|
| BROADWAY | 11871 | 4714 | 28 |
knitr::kable(q1[q1$Num_Killed == max(q1$Num_Killed), ],
caption = "Most dangerous NYC intersections")
| CROSS_STREET_NAME | Number_of_Accidents | Num_Injured | Num_Killed |
|---|---|---|---|
| BROADWAY | 11871 | 4714 | 28 |
safe <- q1[q1$Number_of_Accidents == min(q1$Number_of_Accidents) & q1$Num_Killed == 0 &
q1$Num_Injured == 0,]
knitr::kable(head(safe), caption = "Safest NYC intersections")
| CROSS_STREET_NAME | Number_of_Accidents | Num_Injured | Num_Killed |
|---|---|---|---|
| 0 | 1 | 0 | 0 |
| 043 PCT | 1 | 0 | 0 |
| 1 ave | 1 | 0 | 0 |
| 1 Avenue | 1 | 0 | 0 |
| 1 AVENUE + 2 AVENUE | 1 | 0 | 0 |
| 1 AVENUE TO PLEASANT AVENUE | 1 | 0 | 0 |
We then made 2 bar graphs out of this to plot the top 5 most dangerous and safest NYC cross streets, each broken down into 3 bars by Number_of_Accidents, Num_Injured, and Num_Killed. We did this through the use of gather and dodge positioning.
ggplot(data = q1 %>% arrange(-Number_of_Accidents) %>% top_n(5,Number_of_Accidents) %>%
gather("Number_of_Accidents", "Num_Injured", "Num_Killed",key = Type, value = Amount),
aes(x = CROSS_STREET_NAME, y = Amount, fill = Type)) +
geom_col(position = "dodge") +
ggtitle("Most dangerous NYC intersections") + xlab("Cross street") + ylab("number") +
theme(axis.text.x = element_text(angle = 0),
text = element_text(size = 10, hjust=0.5 , vjust = 1, face = "bold"))
From the plot, we can direcly find that the intersection which has the greatest number of injured people is Broadway and the one which has the greatest number of accidents is 3 Avenue, but the number of killed people is hard to see because of its scale relative to the accident and injured quantity. If you look closely though, Broadway definitely has the thickest and most visible green mark for number killed.
ggplot(data = q1 %>% arrange(Number_of_Accidents) %>% head(5) %>%
gather("Number_of_Accidents", "Num_Injured", "Num_Killed",key = Type, value = Amount),
aes(x = CROSS_STREET_NAME, y = Amount, fill = Type)) +
geom_col(position = "dodge") +
ggtitle("Safest NYC intersections") + xlab("Cross street") + ylab("number") +
theme(axis.text.x = element_text(angle = 0),
text = element_text(size = 10, hjust=0.5, vjust = 1, face = "bold"))
From the plot, we can direcly find that the intersections which have the least number of injured people is 0, 043 PCT and 1 Avenue, and all of the top 5 safest intersections have "1" for number of accidents and "0" for the number of killed people.
From the plot of the most dangerous intersections of NYC, we noticed that the number of killed people is to small to see from the plot. We use ggmap to get the google map of NYC and figure out the distribution of people killed based on the map of NYC and utilizing the latitude and longitude coordinates. We created a data frame that gave the lat. and long. for each accident in which someone was killed, by omitting ones in which no one was killed with filter.
colnames(nypd.vehicle) <- str_replace_all(colnames(nypd.vehicle), " ", "_")
nypd <- nypd.vehicle %>%
select(LATITUDE,LONGITUDE,NUMBER_OF_PERSONS_KILLED) %>%
gather(type,value,3:3) %>%
na.omit() %>%
group_by(LATITUDE,LONGITUDE,type) %>%
summarise(total=sum(value,na.rm=T)) %>%
filter(total!=0)
nypd
## # A tibble: 996 x 4
## # Groups: LATITUDE, LONGITUDE [996]
## LATITUDE LONGITUDE type total
## <dbl> <dbl> <chr> <int>
## 1 40.50619 -74.23489 NUMBER_OF_PERSONS_KILLED 1
## 2 40.51387 -74.23668 NUMBER_OF_PERSONS_KILLED 1
## 3 40.51662 -74.20279 NUMBER_OF_PERSONS_KILLED 1
## 4 40.52400 -74.18610 NUMBER_OF_PERSONS_KILLED 1
## 5 40.52664 -74.16813 NUMBER_OF_PERSONS_KILLED 1
## 6 40.52937 -74.16135 NUMBER_OF_PERSONS_KILLED 2
## 7 40.53078 -74.22443 NUMBER_OF_PERSONS_KILLED 2
## 8 40.53427 -74.15397 NUMBER_OF_PERSONS_KILLED 2
## 9 40.53866 -74.18800 NUMBER_OF_PERSONS_KILLED 1
## 10 40.54101 -74.14710 NUMBER_OF_PERSONS_KILLED 1
## # ... with 986 more rows
nyc <- get_map("new york",zoom=11)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=new+york&zoom=11&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=new%20york&sensor=false
map_killed <- ggmap(nyc)+geom_point(data=subset(nypd,type=="NUMBER_OF_PERSONS_KILLED"),
aes(x=LONGITUDE, y=LATITUDE, colour=total),size= 2,alpha=0.2) +
ggtitle("Distribution of deaths")+xlab("Longitude")+ylab("Latitude") +
scale_color_gradient(low = "red", high = "black")
map_killed
## Warning: Removed 125 rows containing missing values (geom_point).
From the map, persons who got killed in the collisions are relatively few compared to persons who got injured. Most of the collisions involving death of people took place in Manhattan, Brooklyn and Queens and then Bronx, while Staten island has the least number of death of people.
We combined the number of people injured and the number of people killed to figure out the proportion of crashes that resulted in an injury or death. We got the proportion by taking the mean of the conditional statement of whether someone got injured or killed.
q4 <- nypd.vehicle %>%
mutate(Injured_or_Killed = NUMBER_OF_PERSONS_INJURED + NUMBER_OF_PEDESTRIANS_INJURED + NUMBER_OF_CYCLIST_INJURED + NUMBER_OF_MOTORIST_INJURED +NUMBER_OF_PERSONS_KILLED + NUMBER_OF_PEDESTRIANS_KILLED + NUMBER_OF_CYCLIST_KILLED + NUMBER_OF_MOTORIST_KILLED
) %>%
mutate(Injured_or_Killed_Yes = ifelse(Injured_or_Killed > 0, 1, 0))
prop <- mean(q4$Injured_or_Killed_Yes)
prop
## [1] 0.1886957
It was around 19% of people that get injured or killed per crash.
In order to look at how the average number of accidents varied over time within each of the 5 boroughs throughout the years of the dataset, we filtered the data by the borough variable and then used group_by to organize by the date of the year and borough and summarised an average accidents variable that was the mean of the number of accidents per month in that borough at each of those moments in time. We did this with a geom_line to track the trends and ebbs and flows for each and grouped by the date it occured and the borough it occurred in and summarise the average collisions per month for those groups.
nypd.vehicle$DATE <- mdy(nypd.vehicle$DATE)
borough.accident <- nypd.vehicle %>% filter(BOROUGH!="") %>% group_by(DATE,BOROUGH) %>% summarise(Average_CollisionsPerMonth = mean(n())) %>% na.omit()
ggplot(data=borough.accident,aes(x=DATE, y=Average_CollisionsPerMonth, colour=BOROUGH, group=BOROUGH)) + geom_line()+geom_point(size=0.5,shape=1) + ggtitle("Average Accidents per month by Borough")+geom_text(aes(label=ifelse(Average_CollisionsPerMonth>180,Average_CollisionsPerMonth,"")), size=2,hjust=1.5)
The result seems quite obvious.
The overall ranking of the boroughs from the most accidents happened to the least is:
1. Brooklyn
2. Queens
3. Manhattan
4. Bronx
5. Staten Island
In order to find the type of vehicles (pickup truck, passenger, taxi, etc.) most frequently caused a collision, we add the two vehicle types together and sort the 10 most common types of vehicles based on the number of collisions. We did this by creating a union of vehicle_1 and vehicle_2 from q5.1 and q5.2, as there are usually 2 vehicles involved in a collision.
q5.1 <- nypd.vehicle %>%
rename(VEHICLE_TYPE = VEHICLE_TYPE_CODE_1) %>%
group_by(VEHICLE_TYPE) %>%
summarise(Number_of_Accidents = n()) %>%
mutate(Type = "VEHICLE_TYPE_CODE_1")
q5.2 <- nypd.vehicle %>%
rename(VEHICLE_TYPE = VEHICLE_TYPE_CODE_2) %>%
group_by(VEHICLE_TYPE) %>%
summarise(Number_of_Accidents = n()) %>%
mutate(Type = "VEHICLE_TYPE_CODE_2")
q2 <- union_all(q5.1, q5.2) %>%
filter(!VEHICLE_TYPE %in% c("UNKNOWN", "OTHER", NA)) %>%
group_by(Type) %>%
arrange(-Number_of_Accidents) %>%
top_n(10,Number_of_Accidents)
knitr::kable(q2, caption = "Number of Collisions by Vehicle Type")
| VEHICLE_TYPE | Number_of_Accidents | Type |
|---|---|---|
| PASSENGER VEHICLE | 662184 | VEHICLE_TYPE_CODE_1 |
| PASSENGER VEHICLE | 496070 | VEHICLE_TYPE_CODE_2 |
| SPORT UTILITY / STATION WAGON | 274688 | VEHICLE_TYPE_CODE_1 |
| SPORT UTILITY / STATION WAGON | 205959 | VEHICLE_TYPE_CODE_2 |
| TAXI | 44544 | VEHICLE_TYPE_CODE_1 |
| TAXI | 36640 | VEHICLE_TYPE_CODE_2 |
| VAN | 26846 | VEHICLE_TYPE_CODE_1 |
| BICYCLE | 25180 | VEHICLE_TYPE_CODE_2 |
| VAN | 23905 | VEHICLE_TYPE_CODE_2 |
| PICK-UP TRUCK | 20446 | VEHICLE_TYPE_CODE_1 |
| PICK-UP TRUCK | 17816 | VEHICLE_TYPE_CODE_2 |
| SMALL COM VEH(4 TIRES) | 14874 | VEHICLE_TYPE_CODE_1 |
| SMALL COM VEH(4 TIRES) | 14627 | VEHICLE_TYPE_CODE_2 |
| LARGE COM VEH(6 OR MORE TIRES) | 14117 | VEHICLE_TYPE_CODE_1 |
| BUS | 14044 | VEHICLE_TYPE_CODE_1 |
| LARGE COM VEH(6 OR MORE TIRES) | 13367 | VEHICLE_TYPE_CODE_2 |
| BUS | 11460 | VEHICLE_TYPE_CODE_2 |
| LIVERY VEHICLE | 9615 | VEHICLE_TYPE_CODE_1 |
| LIVERY VEHICLE | 7704 | VEHICLE_TYPE_CODE_2 |
| MOTORCYCLE | 6607 | VEHICLE_TYPE_CODE_1 |
We then created a bar graph that illustrated the 10 most frequent car types that got into a collision, broken down by the 2 vehicle codes and how many collisions there were involving those vehicle types.
ggplot(data = q2,
aes(x = VEHICLE_TYPE, y = Number_of_Accidents, fill = Type)) +
geom_col(position = "stack") +
ggtitle("Number of Collisions by Vehicle Type") + ylab("Number of collisions") + xlab("Vehicle_Type") +
theme(axis.text.x = element_text(angle = 0),
text = element_text(size = 10, hjust=0.5, vjust = 1, face = "bold")) + coord_flip()
From the plot, passanger vehicles are involved in much more accidents than any other vehicles. Sport utility/station wagon come in second, involved in less accidents than the passanger vehicles but still a lot relative to even the third place vehicle. The rest seems similar after taxis.
Then we try to find the most common contributing factors leading to a collision. We group together the factors similar to what we did with the vehicle types for vehicle 1 and vehicle 2 in the accident and show the top ten factors that led to a crash.
q3.1 <- nypd.vehicle %>%
rename(CONTRIBUTING_FACTOR = CONTRIBUTING_FACTOR_VEHICLE_1) %>%
group_by(CONTRIBUTING_FACTOR) %>%
summarise(Number_of_Accidents = n()) %>%
mutate(Type = "CONTRIBUTING_FACTOR_VEHICLE_1")
q3.2 <- nypd.vehicle %>%
rename(CONTRIBUTING_FACTOR = CONTRIBUTING_FACTOR_VEHICLE_2) %>%
group_by(CONTRIBUTING_FACTOR) %>%
summarise(Number_of_Accidents = n()) %>%
mutate(Type = "CONTRIBUTING_FACTOR_VEHICLE_2")
q3 <- union_all(q3.1, q3.2) %>%
filter(!CONTRIBUTING_FACTOR %in% c("UNKNOWN", "Unspecified", NA)) %>%
group_by(Type) %>%
arrange(-Number_of_Accidents) %>%
top_n(10,Number_of_Accidents)
knitr::kable(q3, caption = "Number of Collisions for each Contributing Factor")
| CONTRIBUTING_FACTOR | Number_of_Accidents | Type |
|---|---|---|
| Driver Inattention/Distraction | 166389 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Failure to Yield Right-of-Way | 54010 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Fatigued/Drowsy | 48457 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Driver Inattention/Distraction | 46476 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Backing Unsafely | 35893 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Other Vehicular | 34846 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Turning Improperly | 28209 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Following Too Closely | 27306 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Lost Consciousness | 20232 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Other Vehicular | 20047 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Prescription Medication | 15900 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Traffic Control Disregarded | 15417 | CONTRIBUTING_FACTOR_VEHICLE_1 |
| Fatigued/Drowsy | 13027 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Failure to Yield Right-of-Way | 10393 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Turning Improperly | 6620 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Lost Consciousness | 6344 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Backing Unsafely | 5674 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Driver Inexperience | 4326 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Following Too Closely | 4019 | CONTRIBUTING_FACTOR_VEHICLE_2 |
| Traffic Control Disregarded | 3831 | CONTRIBUTING_FACTOR_VEHICLE_2 |
We then created a bar graph that illustrated the 10 most popular contributing factors that got into a collision, broken down by the 2 vehicle codes and the amount of crashes these factors were involved in.
ggplot(data = q3,
aes(x = CONTRIBUTING_FACTOR, y = Number_of_Accidents, fill = Type)) +
geom_col(position = "stack") +
ggtitle("Number of Collisions by Contributing Factor") + xlab("Contributing Factor") +
theme(axis.text.x = element_text(angle = 0),
text = element_text(size = 10, hjust=0.5, vjust = 1, face = "bold")) + coord_flip()
From the result, driver inattention/distraction is the most common contributing factors that caused collisions. This is because driver inattention can be looking off the road for a second, texting and driving, eating, etc. We would think failure to yield the right of way would be more common in a city where everyone is in a hurry.
In order to figure out certain time periods that the collision occured more frequently, we split one day into 24 hours and count the number of collisions happened in each time interval.
# 60 minute
q5.1 <- nypd.vehicle %>%
group_by(TIME) %>%
summarise(n = n()) %>%
mutate(time = hms(TIME),
time = hour(time)*3600 + minute(time)*60 + second(time),
times = ceiling(time/3600),
time.end = time/3600
)
q5.interval <- q5.1 %>%
filter(times == time.end) %>%
union_all(q5.1 %>% top_n(1)) %>%
select(times, TIME)
## Selecting by time.end
q5.interval <- q5.interval %>%
rename(start_time = TIME) %>%
inner_join(
q5.interval %>%
rename(end_time = TIME) %>%
mutate(times = times - 1)
)
## Joining, by = "times"
q5.2 <- q5.1 %>%
group_by(times) %>%
summarise(n = sum(n)) %>%
inner_join(q5.interval) %>%
arrange(-n) %>%
mutate(Collisions = n) %>%
select(start_time, end_time, Collisions) %>%
mutate(time_interval = paste0(start_time, "->", end_time)) %>%
arrange(start_time)
## Joining, by = "times"
knitr::kable(q5.2, caption = "Most frequent 1-hour intervals for collisions to occur")
| start_time | end_time | Collisions | time_interval |
|---|---|---|---|
| 00:00:00 | 01:00:00 | 4683 | 00:00:00->01:00:00 |
| 01:00:00 | 02:00:00 | 28900 | 01:00:00->02:00:00 |
| 02:00:00 | 03:00:00 | 17467 | 02:00:00->03:00:00 |
| 03:00:00 | 04:00:00 | 13518 | 03:00:00->04:00:00 |
| 04:00:00 | 05:00:00 | 11826 | 04:00:00->05:00:00 |
| 05:00:00 | 06:00:00 | 14170 | 05:00:00->06:00:00 |
| 06:00:00 | 07:00:00 | 15668 | 06:00:00->07:00:00 |
| 07:00:00 | 08:00:00 | 25501 | 07:00:00->08:00:00 |
| 08:00:00 | 09:00:00 | 37082 | 08:00:00->09:00:00 |
| 09:00:00 | 10:00:00 | 66095 | 09:00:00->10:00:00 |
| 10:00:00 | 11:00:00 | 63088 | 10:00:00->11:00:00 |
| 11:00:00 | 12:00:00 | 59013 | 11:00:00->12:00:00 |
| 12:00:00 | 13:00:00 | 62297 | 12:00:00->13:00:00 |
| 13:00:00 | 14:00:00 | 66217 | 13:00:00->14:00:00 |
| 14:00:00 | 15:00:00 | 69425 | 14:00:00->15:00:00 |
| 15:00:00 | 16:00:00 | 80642 | 15:00:00->16:00:00 |
| 16:00:00 | 17:00:00 | 71858 | 16:00:00->17:00:00 |
| 17:00:00 | 18:00:00 | 86260 | 17:00:00->18:00:00 |
| 18:00:00 | 19:00:00 | 82322 | 18:00:00->19:00:00 |
| 19:00:00 | 20:00:00 | 70778 | 19:00:00->20:00:00 |
| 20:00:00 | 21:00:00 | 57371 | 20:00:00->21:00:00 |
| 21:00:00 | 22:00:00 | 47917 | 21:00:00->22:00:00 |
| 22:00:00 | 23:00:00 | 40058 | 22:00:00->23:00:00 |
| 23:00:00 | 23:59:00 | 36018 | 23:00:00->23:59:00 |
ggplot(data = q5.2,
aes(x = seq(0,23,1), y = Collisions, fill = time_interval)) +
geom_col(position = "stack") +
ggtitle("Collisions by Hour") + xlab("Time(hrs)") + ylab("Number of collisions")+
theme(axis.text.x = element_text(angle = 0),
text = element_text(hjust=0.5, vjust = 1, size = 10, face = "bold"),
legend.position="none")
From the plot, we found that the number of accidents rises suddenly between 8am and 9am and continues to rise through 3 pm. The maximum hour in terms of accidents was at 5pm.